pyquery库
初始化
-
字符串初始化
- from pyquery import PyQuery as pq
doc = pq(html) #声明pq对象
print(doc('li')) # 获取li标签
-
URL初始化
-
文件初始化
- doc = pq(filename='demo.html') #跟本文件在同一个目录下
print(doc('li'))
基本CSS选择器
- from pyquery import PyQuery as pq
doc = pq(html)
print(doc('#container .list li')) #传递 id ,类,标签之类的
查找元素
-
子元素
- from pyquery import PyQuery as pq
doc = pq(html)
items = doc('.list') #先选中了 .list 中的内容
print(type(items))
print(items)
lis = items.find('li') #查找items中的li的标签内容
print(type(lis))
print(lis)
-
查找直接的子元素
- lis = items.children()
print(type(lis))
print(lis)
-
父元素
- from pyquery import PyQuery as pq
doc = pq(html)
items = doc('.list') #获取.list的pq对象
container = items.parent() #获取其父元素
print(type(container)) #打印其类型
print(container) #打印内容
-
祖先元素
- from pyquery import PyQuery as pq
doc = pq(html)
items = doc('.list')
parents = items.parents() #查找所有的祖先节点
print(type(parents))
print(parents) #这里返回了两个祖先节点:一个是最外层的祖先元素,一个是直接的父元素
-
兄弟元素
- from pyquery import PyQuery as pq
doc = pq(html) #首先获取pq对象
li = doc('.list .item-0.active') #有空格是后代选择器;没有空格表示同一个元素同时具有这些类
print(li.siblings()) #兄弟元素
遍历
-
单个元素
- from pyquery import PyQuery as pq
doc = pq(html)
lis = doc('li').items() #生成一个迭代器
print(type(lis))
for li in lis: #for循环依次遍历
print(li)
获取信息
-
获取属性
- from pyquery import PyQuery as pq
doc = pq(html)
a = doc('.item-0.active a')
print(a)
print(a.attr('href')) #使用的是函数
print(a.attr.href) #使用 .
-
获取文本
- from pyquery import PyQuery as pq
doc = pq(html)
a = doc('.item-0.active a')
print(a)
print(a.text()) #只要其中包含的文本信息
-
获取HTML
- from pyquery import PyQuery as pq
doc = pq(html)
li = doc('.item-0.active')
print(li)
print(li.html()) #获取了li里面的html
DOM操作
-
addClass、removeClass
- from pyquery import PyQuery as pq
doc = pq(html)
li = doc('.item-0.active')
print(li)
li.removeClass('active') #删除类
print(li)
li.addClass('active') #添加类
print(li)
-
attr、css
- from pyquery import PyQuery as pq
doc = pq(html)
li = doc('.item-0.active')
print(li)
li.attr('name', 'link') #添加属性name = link的属性
print(li)
li.css('font-size', '14px') #添加css style属性
print(li)
-
remove
- from pyquery import PyQuery as pq
doc = pq(html)
wrap = doc('.wrap')
print(wrap.text())
wrap.find('p').remove() #把p标签进行删除
print(wrap.text())
其他DOM方法
-
伪类选择器
- from pyquery import PyQuery as pq
doc = pq(html)
li = doc('li:first-child') #选取作为其父元素第一个子元素的li
print(li)
li = doc('li:last-child')
print(li)
li = doc('li:nth-child(2)')
print(li)
li = doc('li:gt(2)')
print(li)
li = doc('li:nth-child(2n)')
print(li)
li = doc('li:contains(second)')
print(li)
官方文档
pyquery
初始化
字符串初始化
1
2
3
4
5
6
7
8
9
10
11
12
13
14 | html = '''
<div>
<ul>
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html) #声明pq对象
print(doc('li')) #获取li标签的信息
|
| <li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
|
URL初始化
| from pyquery import PyQuery as pq
doc = pq(url='http://www.baidu.com') #url地址
print(doc('head'))
|
| <head><meta http-equiv="content-type" content="text/html;charset=utf-8"/><meta http-equiv="X-UA-Compatible" content="IE=Edge"/><meta content="always" name="referrer"/><link rel="stylesheet" type="text/css" href="http://s1.bdstatic.com/r/www/cache/bdorz/baidu.min.css"/><title>百度一下，你就知道</title></head>
|
文件初始化
| from pyquery import PyQuery as pq
doc = pq(filename='demo.html') #传入本地的html文件
print(doc('li'))
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19 | ---------------------------------------------------------------------------
FileNotFoundError Traceback (most recent call last)
<ipython-input-3-0bb1b77a760d> in <module>
1 from pyquery import PyQuery as pq
----> 2 doc = pq(filename='demo.html') #传入本地的html文件
3 print(doc('li'))
D:\Anaconda3\envs\CPU\lib\site-packages\pyquery\pyquery.py in __init__(self, *args, **kwargs)
215 # specific case to get the dom
216 if 'filename' in kwargs:
--> 217 html = open(kwargs['filename'])
218 elif 'url' in kwargs:
219 url = kwargs.pop('url')
FileNotFoundError: [Errno 2] No such file or directory: 'demo.html'
|
基本CSS选择器
1
2
3
4
5
6
7
8
9
10
11
12
13
14 | html = '''
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
print(doc('#container .list li')) #传递 id ,类,标签之类的
|
| <li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
|
查找元素
子元素
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19 | html = '''
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
items = doc('.list') #先选中了 .list 中的内容
print(type(items))
print(items)
lis = items.find('li') #查找items中的li的标签内容
print(type(lis)) #每一个都是pq对象,可以链式的查找
print(lis)
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15 | <class 'pyquery.pyquery.PyQuery'>
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
<class 'pyquery.pyquery.PyQuery'>
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
|
| lis = items.children()
print(type(lis))
print(lis)
|
| <class 'pyquery.pyquery.PyQuery'>
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
|
| lis = items.children('.active')
print(lis)
|
| <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
|
父元素
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 | html = '''
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
items = doc('.list') #获取.list的pq对象
container = items.parent() #获取其父元素
print(type(container)) #打印其类型
print(container) #打印内容
|
| <class 'pyquery.pyquery.PyQuery'>
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19 | html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
items = doc('.list')
parents = items.parents() #查找所有的祖先节点
print(type(parents))
print(parents) #这里返回了两个祖先节点:一个是最外层的 .wrap,一个是直接的父元素 #container
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20 | <class 'pyquery.pyquery.PyQuery'>
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div><div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
|
| parent = items.parents('.wrap')
print(parent)
|
| <div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
|
兄弟元素
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 | html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html) #首先获取pq对象
li = doc('.list .item-0.active') #有空格是后代选择器;没有空格表示同一个元素同时具有这些类
print(li.siblings()) #兄弟元素
|
| <li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0">first item</li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 | html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
li = doc('.list .item-0.active')
print(li.siblings('.active'))
|
| <li class="item-1 active"><a href="link4.html">fourth item</a></li>
|
遍历
单个元素
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17 | html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
li = doc('.item-0.active')
print(li)
|
| <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19 | html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
lis = doc('li').items() #生成一个迭代器
print(type(lis))
for li in lis: #for循环依次遍历
print(li)
|
| <class 'generator'>
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
|
获取信息
获取属性
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19 | html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
a = doc('.item-0.active a')
print(a)
print(a.attr('href')) #使用的是函数
print(a.attr.href) #使用 .
|
| <a href="link3.html"><span class="bold">third item</span></a>
link3.html
link3.html
|
获取文本
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 | html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
a = doc('.item-0.active a')
print(a)
print(a.text()) #只要其中包含的文本信息
|
| <a href="link3.html"><span class="bold">third item</span></a>
third item
|
获取HTML
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 | html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
li = doc('.item-0.active')
print(li)
print(li.html()) #获取了li里面的html
|
| <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<a href="link3.html"><span class="bold">third item</span></a>
|
DOM操作
addClass、removeClass
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21 | html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
li = doc('.item-0.active')
print(li)
li.removeClass('active') #删除类
print(li)
li.addClass('active') #添加类
print(li)
|
| <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-0"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
|
attr、css
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21 | html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
li = doc('.item-0.active')
print(li)
li.attr('name', 'link') #添加属性name = link的属性
print(li)
li.css('font-size', '14px') #添加css style属性
print(li)
|
| <li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-0 active" name="link"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-0 active" name="link" style="font-size: 14px"><a href="link3.html"><span class="bold">third item</span></a></li>
|
remove
1
2
3
4
5
6
7
8
9
10
11
12 | html = '''
<div class="wrap">
Hello, World
<p>This is a paragraph.</p>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
wrap = doc('.wrap')
print(wrap.text())
wrap.find('p').remove() #把p标签进行删除
print(wrap.text())
|
| Hello, World
This is a paragraph.
Hello, World
|
其他DOM方法
http://pyquery.readthedocs.io/en/latest/api.html
伪类选择器
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27 | html = '''
<div class="wrap">
<div id="container">
<ul class="list">
<li class="item-0">first item</li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-0 active"><a href="link3.html"><span class="bold">third item</span></a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
</ul>
</div>
</div>
'''
from pyquery import PyQuery as pq
doc = pq(html)
li = doc('li:first-child') #选取作为其父元素第一个子元素的li
print(li)
li = doc('li:last-child')
print(li)
li = doc('li:nth-child(2)')
print(li)
li = doc('li:gt(2)')
print(li)
li = doc('li:nth-child(2n)')
print(li)
li = doc('li:contains(second)')
print(li)
|
1
2
3
4
5
6
7
8
9
10
11
12
13 | <li class="item-0">first item</li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-0"><a href="link5.html">fifth item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
<li class="item-1 active"><a href="link4.html">fourth item</a></li>
<li class="item-1"><a href="link2.html">second item</a></li>
|
更多CSS选择器可以查看
http://www.w3school.com.cn/css/index.asp
官方文档
http://pyquery.readthedocs.io/